import pandas as pd
import pickle as pk
from tqdm import tqdm
import json
import glob
from itertools import product
import numpy as np

import sys
# Batch files are indexed 0 .. N_BATCHES - 1.
N_BATCHES = 29

# A narrative is kept by the final filter only if it occurs strictly more
# often than this.
MIN_FREQUENCY = 50

# Input/output locations ({0}/{1} are filled with the batch id and/or `c`).
SENTENCES_PATH = '../data/gpo_split_sentences_by_batch/split_sentences_{0}.csv'
NARRATIVES_PATH = '../data/gpo_narratives/narratives_{0}_{1}.csv'
METADATA_PATH = '../data/metadata/metadata_gpo.csv'
CONGRESS_PATH = '../data/metadata/crosswalk_date_congress.csv'
ALL_OUT_PATH = '../data/gpo_final_data/narratives_all_with_metadata_{0}_no_frequency_filter.csv'
COMPLETE_OUT_PATH = '../data/gpo_final_data/narratives_complete_with_metadata_{0}_no_frequency_filter.csv'
FILTERED_OUT_PATH = '../data/gpo_final_data/narratives_complete_with_metadata_{0}.csv'

# Metadata columns carried over into the merged narratives.
METADATA_COLUMNS = ['birthday', 'chamber', 'first_name', 'gender', 'gpo_id', 'last_name', 'party', 'state', 'tag', 'tenure_end', 'tenure_start', 'wikipedia_id']


def prepare_batch(narratives_batch: pd.DataFrame, sentences_batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
    """Attach raw sentences to one batch of narratives and clean its roles.

    Args:
        narratives_batch: Narratives of one batch. Must contain the columns
            'sentence', 'ARG1', 'ARG2', 'ARG1-RAW', 'ARG2-RAW', 'B-V-RAW'
            and 'B-ARGM-NEG-RAW'.
        sentences_batch: Sentences of the same batch, with the text in a
            column named 'sentence'. Rows are matched to narratives by
            their 0-based position in this frame.
        batch_id: Value stored in the new 'batch_id' column.

    Returns:
        A new frame; neither input is modified.
    """
    sentences = sentences_batch.rename(columns={'sentence': 'sentence_raw'})
    # The narratives refer to a sentence by its row position within the batch.
    sentences['sentence'] = range(len(sentences))

    merged = narratives_batch.merge(sentences[['sentence', 'sentence_raw']], on='sentence', how='left')
    merged['batch_id'] = batch_id

    # Replace missing ARG1 with ARG2 (where available). Assign the result
    # explicitly: an in-place fillna on a selected column is chained
    # assignment and is not guaranteed to update the frame.
    merged['ARG1'] = merged['ARG1'].fillna(merged['ARG2'])
    merged['ARG1-RAW'] = merged['ARG1-RAW'].fillna(merged['ARG2-RAW'])

    # Prefix the verb with 'not-' where a negation is flagged; any value
    # other than True (including missing) leaves the verb untouched.
    negated = merged['B-ARGM-NEG-RAW'].eq(True)
    merged['B-V-RAW'] = np.where(negated, 'not-' + merged['B-V-RAW'], merged['B-V-RAW'])
    return merged


def load_batch(batch_id: int, c: int) -> pd.DataFrame:
    """Read the sentence and narrative CSVs of one batch and combine them."""
    sentences_batch = pd.read_csv(SENTENCES_PATH.format(batch_id))
    narratives_batch = pd.read_csv(NARRATIVES_PATH.format(batch_id, c))
    return prepare_batch(narratives_batch, sentences_batch, batch_id)


def extract_gpo_id(doc: str) -> int:
    """Return the integer between the last '_' of `doc` and the next '.'."""
    return int(doc.split('_')[-1].split('.')[0])


def extract_date(doc: str) -> str:
    """Return the part of the file name of `doc` before its first '.' or '_'."""
    return doc.split('/')[-1].split('.')[0].split('_')[0]


def add_frequency(df: pd.DataFrame) -> pd.DataFrame:
    """Add 'n' (always 1) and 'frequency' (rows sharing the same 'narrative').

    The columns are added to `df` in place; `df` is returned for convenience.
    """
    df['n'] = 1
    df['frequency'] = df.groupby('narrative')['n'].transform('sum')
    return df


def main(argv) -> None:
    """Merge all narrative batches with metadata and write three CSV files.

    `argv[1]` is an integer `c` that selects the narratives files
    (narratives_<batch>_<c>.csv) and is embedded in the output file names.

    Outputs, in order:
      1. all narratives with GPO ids (before dropping incomplete ones),
      2. complete narratives with metadata, Congress info and frequencies,
      3. the same, restricted to narratives with frequency > MIN_FREQUENCY.
    """
    if len(argv) < 2:
        raise SystemExit('usage: {0} <c>'.format(argv[0] if argv else 'script'))
    c = int(argv[1])

    # For each batch, load narratives and sentences; then merge them.
    # Collect the batches and concatenate once: DataFrame.append was removed
    # in pandas 2.0 and re-copied the accumulated frame on every iteration.
    batches = []
    for batch_id in range(N_BATCHES):
        print('At batch:', batch_id)
        batches.append(load_batch(batch_id, c))
    df = pd.concat(batches, ignore_index=True)

    # Create GPO id from document name
    print('Getting GPO IDs...')
    df['gpo_id'] = [extract_gpo_id(d) for d in df['doc']]

    # Save all narratives
    df.to_csv(ALL_OUT_PATH.format(c), index=False)

    # Drop incomplete narratives
    # NOTE(review): the agent column is spelled 'ARGO' (letter O) here;
    # confirm this matches the narratives CSV header rather than 'ARG0'.
    df = df.dropna(subset=['ARGO', 'ARG1', 'B-V-RAW'])

    # Load and merge metadata
    metadata = pd.read_csv(METADATA_PATH)[METADATA_COLUMNS]
    df = df.merge(metadata, on='gpo_id', how='left')

    # Concatenate arguments to narratives
    df['narrative'] = df['ARGO'] + ' ' + df['B-V-RAW'] + ' ' + df['ARG1']

    # Add information on which Congress
    # NOTE(review): 'date' is built as a string here; this assumes the
    # crosswalk's 'date' column is also read as strings -- confirm.
    df['date'] = [extract_date(d) for d in df['doc']]
    congress = pd.read_csv(CONGRESS_PATH)
    df = df.merge(congress, on='date', how='left')

    # Calculate total frequency
    df = add_frequency(df)
    df.to_csv(COMPLETE_OUT_PATH.format(c), index=False)

    # Filter for total frequency
    df = df[df.frequency > MIN_FREQUENCY]

    # Save intermediate results
    df.to_csv(FILTERED_OUT_PATH.format(c), index=False)


if __name__ == '__main__':
    main(sys.argv)
